In [ ]:
from osgeo import gdal
import pandas as pd
import geopandas as gp
from matplotlib import pyplot as plt
from matplotlib import colors
import py7zr

1. Import files from Google Drive and extract them into data folder¶

In [ ]:
import gdown
# Google drive public shared folder ID
id = "1vdm8b0ewDslPVvubA8emwDYHUGtwx5pT"
gdown.download_folder(id=id,output="../data", quiet=True, use_cookies=False)
Out[ ]:
['../data\\bike_data.7z']
In [ ]:
import py7zr
# Extract files from 7zip file
with py7zr.SevenZipFile('../data/bike_data.7z', mode='r') as z:
    z.extractall("../data")

2. Read files¶

In [ ]:
df = pd.read_csv('../data/accidentsVelo.csv')
df.head()
C:\Users\a815362\AppData\Local\Temp\ipykernel_4928\3584366541.py:1: DtypeWarning: Columns (8,9,20,21,30) have mixed types. Specify dtype option on import or set low_memory=False.
  df = pd.read_csv('../data/accidentsVelo.csv')
Out[ ]:
Num_Acc date an mois jour hrmn dep com lat long ... secuexist equipement obs obsm choc manv vehiculeid typevehicules manoeuvehicules numVehicules
0 200500000030 2005-01-13 2005 janvier jeudi 19:45 62 62331 50.3 2.84 ... 0 0 0.0 2.0 8.0 11.0 200500000030B02 18 17 1.0
1 200500000034 2005-01-19 2005 janvier mercredi 10:45 62 62022 0.0 0.0 ... 0 0 0.0 2.0 1.0 1.0 200500000034B02 10 15 1.0
2 200500000078 2005-01-26 2005 janvier mercredi 13:15 02 02173 0.0 0.0 ... 1 2 0.0 2.0 1.0 1.0 200500000078B02 7 15 1.0
3 200500000093 2005-01-03 2005 janvier lundi 13:30 02 02810 49.255 3.094 ... 0 0 0.0 2.0 3.0 21.0 200500000093B02 7 21 1.0
4 200500000170 2005-01-29 2005 janvier samedi 18:30 76 76196 0.0 0.0 ... 1 9 0.0 2.0 4.0 2.0 200500000170A01 10 2 1.0

5 rows × 39 columns

Removing NA's on lat and long long columns only¶

In [ ]:
num_rows = len(df)
df = df.loc[(df["lat"].notna() & df["long"].notna())]
num_rows_not_na = len(df)
print("Removed NA's rows : %d" % (num_rows -num_rows_not_na) )
Removed NA's rows : 268

Check column types and convert if necessary¶

In [ ]:
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 74490 entries, 0 to 74757
Data columns (total 39 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Num_Acc          74490 non-null  int64  
 1   date             74490 non-null  object 
 2   an               74490 non-null  int64  
 3   mois             74490 non-null  object 
 4   jour             74490 non-null  object 
 5   hrmn             74490 non-null  object 
 6   dep              74490 non-null  object 
 7   com              74490 non-null  object 
 8   lat              74490 non-null  object 
 9   long             74490 non-null  object 
 10  agg              74490 non-null  int64  
 11  int              74490 non-null  int64  
 12  col              74486 non-null  float64
 13  lum              74490 non-null  int64  
 14  atm              74488 non-null  float64
 15  catr             74490 non-null  int64  
 16  circ             74347 non-null  float64
 17  nbv              74293 non-null  float64
 18  prof             74314 non-null  float64
 19  plan             74291 non-null  float64
 20  lartpc           63519 non-null  object 
 21  larrout          69399 non-null  object 
 22  surf             74317 non-null  float64
 23  infra            73955 non-null  float64
 24  situ             73997 non-null  float64
 25  grav             74490 non-null  int64  
 26  sexe             74490 non-null  int64  
 27  age              74464 non-null  float64
 28  trajet           74487 non-null  float64
 29  secuexist        74490 non-null  int64  
 30  equipement       70749 non-null  object 
 31  obs              74464 non-null  float64
 32  obsm             74449 non-null  float64
 33  choc             74478 non-null  float64
 34  manv             74474 non-null  float64
 35  vehiculeid       74490 non-null  object 
 36  typevehicules    64223 non-null  object 
 37  manoeuvehicules  64206 non-null  object 
 38  numVehicules     64223 non-null  float64
dtypes: float64(16), int64(9), object(14)
memory usage: 22.7+ MB

Convert lat and long cols to float¶

In [ ]:
df['lat'] = df['lat'].apply(lambda x: float(str(x).replace(',', '.')))
df['long'] = df['long'].apply(lambda x: float(str(x).replace(',', '.')))
In [ ]:
print("Lat and long colum type are |{}| and |{}| ".format(df.dtypes['lat'],df.dtypes['long'] ))
Lat and long colum type are |float64| and |float64| 

Checking zeros on lat and long cols and total number of cols¶

In [ ]:
count_lat = df['lat'].value_counts()[0]
count_long = df['long'].value_counts()[0]
rows = len(df)

results = pd.DataFrame(data={"lat_with_zero":[count_lat], "long_with_zero":[count_long],"Total number of rows": [rows]})
results
Out[ ]:
lat_with_zero long_with_zero Total number of rows
0 42084 43131 74490

Remove rows where lat and long columns are not 0¶

In [ ]:
df_filtered = df.loc[(df['lat'] != 0) & (df['long'] != 0)].copy()
print("Non zero coordinates data frame has %d rows" %(len(df_filtered)))
Non zero coordinates data frame has 31355 rows

Convert to a proper geodataframe¶

In [ ]:
# Convert longitude and latitude to a geometric Point object
points_gdf = gp.GeoDataFrame(df_filtered, geometry=gp.points_from_xy(df_filtered.lat, df_filtered.long))
# Convert DataFrame to GeoDataFrame
points_gdf = points_gdf.set_crs('epsg:4326')
points_gdf.plot(aspect='equal')
print(points_gdf.crs)
epsg:4326
No description has been provided for this image

Its seems the coordinates are inverted, let's fix this¶

In [ ]:
# Convert longitude and latitude to a geometric Point object
points_gdf = gp.GeoDataFrame(df_filtered, geometry=gp.points_from_xy(df_filtered.long, df_filtered.lat))
# Convert DataFrame to GeoDataFrame
points_gdf = points_gdf.set_crs('epsg:4326').to_crs('epsg:27561')
points_gdf.plot()
print(points_gdf.crs)
epsg:27561
No description has been provided for this image

Check columns¶

In [ ]:
points_gdf.head()
Out[ ]:
Num_Acc date an mois jour hrmn dep com lat long ... equipement obs obsm choc manv vehiculeid typevehicules manoeuvehicules numVehicules geometry
0 200500000030 2005-01-13 2005 janvier jeudi 19:45 62 62331 50.300 2.840 ... 0 0.0 2.0 8.0 11.0 200500000030B02 18 17 1.0 POINT (635873.910 289101.859)
3 200500000093 2005-01-03 2005 janvier lundi 13:30 02 02810 49.255 3.094 ... 0 0.0 2.0 3.0 21.0 200500000093B02 7 21 1.0 POINT (655137.321 173039.590)
6 200500000260 2005-01-04 2005 janvier mardi 16:45 22 22143 48.590 -2.300 ... 0 0.0 2.0 1.0 5.0 200500000260B02 7 1 1.0 POINT (258175.124 109337.357)
10 200500000291 2005-01-24 2005 janvier lundi 17:30 56 56155 47.485 -2.467 ... 2 0.0 2.0 3.0 15.0 200500000291B02 7 1 1.0 POINT (238049.201 -12534.885)
14 200500000329 2005-01-04 2005 janvier mardi 19:00 50 50213 49.086 -1.092 ... 0 0.0 2.0 4.0 13.0 200500000329B02 7 2 1.0 POINT (349671.489 159668.389)

5 rows × 40 columns

Plot France department limits and bike accident points¶

In [ ]:
france_gdf = gp.read_file("../data/france_dep.geojson")
france_gdf = france_gdf.to_crs('epsg:27561')

ax = points_gdf.plot(marker='o', color='red', markersize=5,aspect=1)
france_gdf.plot(ax=ax, color='white', edgecolor='black')
plt.show()
No description has been provided for this image

Let's grab only the points inside France and plot bicycle accident points together¶

In [ ]:
points_within = gp.sjoin(points_gdf,france_gdf,predicate='within')
In [ ]:
ax = points_within.plot(legend=True, markersize=3,alpha=0.5,figsize=(12,12))
france_gdf.plot(ax=ax, color='white', alpha=0.5, edgecolor="k")
plt.show()
No description has been provided for this image

Checking if the gravity of the accident can be spatially explained¶

In [ ]:
fig, axs = plt.subplots(2, 2)


points_within_grav_1 = points_within.loc[points_within['grav'] == 1]
points_within_grav_1.plot(ax=axs[0, 0],markersize=3,alpha=0.5,color='green')
axs[0, 0].set_title("Low")

points_within_grav_2 = points_within.loc[points_within['grav'] == 2]
points_within_grav_2.plot(ax=axs[0, 1],markersize=3,alpha=0.5,color='yellow')
axs[0, 1].set_title("Mild")

axs[1, 0].sharex(axs[0, 0])

points_within_grav_3 = points_within.loc[points_within['grav'] == 3]
points_within_grav_3.plot(ax=axs[1, 0],markersize=3,alpha=0.5,color='orange')
axs[1, 0].set_title("High")

points_within_grav_4 = points_within.loc[points_within['grav'] == 4]
points_within_grav_4.plot(ax=axs[1, 1],markersize=3,alpha=0.5,color='red')
axs[1, 1].set_title("Very high")

fig.tight_layout()
No description has been provided for this image

It seems that the accidents of types are well distributed across the country and concentrated on major cities. Also, it may indicate that the gravity of an accident is subjective.¶

Lets check the density of accidents (regardless of it's gravity) by region population¶

In [ ]:
accidents_by_department = points_gdf.sjoin(france_gdf,predicate='within')
accidents_by_department.plot()
Out[ ]:
<Axes: >
No description has been provided for this image

Plot accidents by gravity and by department¶

In [ ]:
accidents_by_region = accidents_by_department.dissolve(by='nom', aggfunc='sum',numeric_only=False)

# Remove index field names
france_gdf=france_gdf.loc[:,~france_gdf.columns.str.startswith('index_')]
accidents_by_region=accidents_by_region.loc[:,~accidents_by_region.columns.str.startswith('index_')]

departments_by_accidents = gp.sjoin(france_gdf,accidents_by_region,how="inner",predicate="intersects")

cmap = colors.LinearSegmentedColormap.from_list("", ["green","yellow","orange","red"])

departments_by_accidents.plot(column = 'grav', scheme='quantiles', cmap=cmap,figsize=(10,10),legend=True);
plt.title("Number of accidents by gravity and department")
plt.show()
/home/paulo/Projects/bike_risk_map/bike_env/lib/python3.10/site-packages/geopandas/geodataframe.py:1780: FutureWarning: Dropping invalid columns in DataFrameGroupBy.sum is deprecated. In a future version, a TypeError will be raised. Before calling .sum, select only columns which should be valid for the function.
  aggregated_data = data.groupby(**groupby_kwargs).agg(aggfunc, **kwargs)
No description has been provided for this image
In [ ]:
departments_by_accidents[["nom","grav"]].sort_values("grav",ascending=False)
Out[ ]:
nom grav
36 Paris 17513
82 Maine-et-Loire 3482
5 Ille-et-Vilaine 3112
33 Pyrénées-Atlantiques 2461
54 Gironde 2376
... ... ...
81 Lot 158
48 Corse-du-Sud 151
30 Lozère 118
95 Territoire de Belfort 113
50 Creuse 76

96 rows × 2 columns

By using a quantilles representation, we notice that touristic zones have a high number of accidents. But Paris, on absolute numbers, is still way higher than other departments¶

Removing spaces and upper cases from cols of both dataframes¶

In [ ]:
accidents_by_region.columns = [x.lower().replace(' ','') for x in accidents_by_region.columns]
accidents_by_department.columns = [x.lower().replace(' ','') for x in accidents_by_department.columns]

Calculate the occurences of accidents by region and add this info to the geodataframe with polygons¶

In [ ]:
num_accidents_per_region = pd.DataFrame({'total':accidents_by_department['nom'].value_counts()}).reset_index().rename(columns={'index': 'nom'})
num_accidents_per_region
Out[ ]:
nom total
0 Paris 5001
1 Maine-et-Loire 994
2 Ille-et-Vilaine 892
3 Gironde 704
4 Pyrénées-Atlantiques 703
... ... ...
91 Lot 56
92 Corse-du-Sud 49
93 Territoire de Belfort 38
94 Lozère 37
95 Creuse 24

96 rows × 2 columns

In [ ]:
# Merge
accidents_by_region_and_name = accidents_by_department.merge(num_accidents_per_region, on='nom')
region_by_accidents = france_gdf.merge(num_accidents_per_region, on='nom')
In [ ]:
accidents_by_region_and_name.plot(column='grav')
Out[ ]:
<Axes: >
No description has been provided for this image
In [ ]:
region_by_accidents.plot(column='total',cmap=cmap,scheme='equalinterval',legend=True,figsize=(10,10))
#boxplot', 'equalinterval', 'fisherjenks', 'fisherjenkssampled', 'headtailbreaks', 'jenkscaspall', 'jenkscaspallforced', 
#'jenkscaspallsampled', 'maxp', 'maximumbreaks', 'naturalbreaks', 'quantiles', 'percentiles', 'prettybreaks', 'stdmean', 'userdefined'
Out[ ]:
<Axes: >
No description has been provided for this image

By using equal intervals, the total number of accidents in Paris is way higher than in most regions of france. Let's make the same plot by using region population instead¶

In [ ]:
population = pd.read_excel('../data/TCRD_004.xlsx',index_col=[0])
population_filtered = population[['nom','2023 (p)']].copy().rename(columns = {'2023 (p)':'pop_2023'})
population_filtered
Out[ ]:
nom pop_2023
01 Ain 671937
02 Aisne 522791
03 Allier 332443
04 Alpes-de-Haute-Provence 166654
05 Hautes-Alpes 139942
... ... ...
91 Essonne 1316053
92 Hauts-de-Seine 1642002
93 Seine-Saint-Denis 1682806
94 Val-de-Marne 1426748
95 Val-d'Oise 1274374

96 rows × 2 columns

In [ ]:
region_by_accidents_pop2023 = region_by_accidents.merge(population_filtered,on='nom')
region_by_accidents_pop2023['acc_per_hab'] = (region_by_accidents_pop2023['total'] / region_by_accidents_pop2023['pop_2023']) * 1000
In [ ]:
region_by_accidents_pop2023.sort_values(by=['acc_per_hab'],ascending=[False]).head(10)
Out[ ]:
code nom geometry total pop_2023 acc_per_hab
36 75 Paris POLYGON ((598781.702 133337.696, 603566.179 13... 5001 2102650 2.378427
21 05 Hautes-Alpes POLYGON ((909410.352 -278534.727, 910653.954 -... 170 139942 1.214789
82 49 Maine-et-Loire POLYGON ((331522.762 14753.942, 332270.240 184... 994 828269 1.200093
60 65 Hautes-Pyrénées MULTIPOLYGON (((400774.858 -493563.673, 399221... 264 230583 1.144924
33 64 Pyrénées-Atlantiques POLYGON ((390674.278 -454989.449, 391301.192 -... 703 697899 1.007309
55 36 Indre POLYGON ((523410.499 -56802.253, 524321.781 -5... 193 214914 0.898034
16 68 Haut-Rhin POLYGON ((960414.264 79357.239, 962829.090 792... 653 769231 0.848900
84 56 Morbihan MULTIPOLYGON (((205667.370 -26249.380, 206759.... 641 777383 0.824561
25 17 Charente-Maritime MULTIPOLYGON (((305094.968 -158476.033, 306205... 547 665904 0.821440
5 35 Ille-et-Vilaine MULTIPOLYGON (((279606.174 105397.241, 276566.... 892 1118600 0.797425
In [ ]:
region_by_accidents_pop2023.plot(column='acc_per_hab',scheme='equalinterval',k=3,cmap=cmap,legend=True,figsize=(10,10))
Out[ ]:
<Axes: >
No description has been provided for this image

Maybe the cyclable rods can provide more info¶

In [ ]:
pistes_cyclable = gp.read_file("../data/france-20230901.geojson")
pistes_cyclable = pistes_cyclable.to_crs('epsg:27561')
pistes_cyclable.plot()
Out[ ]:
<Axes: >
No description has been provided for this image
In [ ]:
pistes_cyclable.head(1)
Out[ ]:
id_local id_osm num_iti code_com_d ame_d regime_d sens_d largeur_d local_d statut_d ... revet_g access_ame date_maj trafic_vit lumiere d_service source project_c ref_geo geometry
0 geovelo_967104105_65226 967104105 NaN 65226 VOIE VERTE AUTRE UNIDIRECTIONNEL NaN NaN EN SERVICE ... NaN NaN 2021-07-25 5.0 NaN NaN Les contributeurs OpenStreetmap 4326 OpenStreetmap LINESTRING (411941.578 -496613.093, 411959.247...

1 rows × 28 columns

Check some colums and unique values¶

In [ ]:
ame = pd.Series(u for u in pistes_cyclable['ame_d'].unique())
sens_d = pd.Series(u for u in pistes_cyclable['sens_d'].unique())
revet_g = pd.Series(u for u in pistes_cyclable['revet_g'].unique())
access_ame = pd.Series(u for u in pistes_cyclable['access_ame'].unique())
lumiere = pd.Series(u for u in pistes_cyclable['lumiere'].unique())
In [ ]:
df_unique = pd.DataFrame({'track_type': ame, 'track_direction':sens_d,'track_material':revet_g,'track_acess':access_ame,'lumiere':lumiere})
df_unique
Out[ ]:
track_type track_direction track_material track_acess lumiere
0 VOIE VERTE UNIDIRECTIONNEL NaN NaN NaN
1 AUTRE BIDIRECTIONNEL LISSE VELO DE ROUTE True
2 AUCUN NaN RUGUEUX ROLLER False
3 AMENAGEMENT MIXTE PIETON VELO HORS VOIE VERTE NaN MEUBLE VTC NaN
4 BANDE CYCLABLE NaN NaN VTT NaN
5 PISTE CYCLABLE NaN NaN NaN NaN
6 COULOIR BUS+VELO NaN NaN NaN NaN
7 GOULOTTE NaN NaN NaN NaN
8 CHAUSSEE A VOIE CENTRALE BANALISEE NaN NaN NaN NaN
9 ACCOTEMENT REVETU HORS CVCB NaN NaN NaN NaN
10 DOUBLE SENS CYCLABLE BANDE NaN NaN NaN NaN
11 VELO RUE NaN NaN NaN NaN
12 DOUBLE SENS CYCLABLE PISTE NaN NaN NaN NaN
13 NaN NaN NaN NaN NaN

Plot proportions of NA for each col¶

In [ ]:
null_track_type_freq = pistes_cyclable['ame_d'].isnull().sum();
not_null_track_type_freq = pistes_cyclable['ame_d'].notnull().sum();

null_track_direction_freq = pistes_cyclable['sens_d'].isnull().sum();
not_null_track_direction_freq = pistes_cyclable['sens_d'].notnull().sum();

null_track_material_freq = pistes_cyclable['revet_g'].isnull().sum();
not_null_track_material_freq = pistes_cyclable['revet_g'].notnull().sum();

null_track_acess_freq = pistes_cyclable['access_ame'].isnull().sum();
not_null_track_acess_freq = pistes_cyclable['access_ame'].notnull().sum();

null_lumiere_freq = pistes_cyclable['lumiere'].isnull().sum();
not_null_lumiere_freq = pistes_cyclable['lumiere'].notnull().sum();

stats = pd.DataFrame({'Col' : ['Track type','Track dir','Track material','Track acess','Lumiere'],
                     'NA': [null_track_type_freq,null_track_direction_freq,null_track_material_freq,null_track_acess_freq,null_lumiere_freq],
                     'Not NA': [not_null_track_type_freq,not_null_track_direction_freq,not_null_track_material_freq,not_null_track_acess_freq,not_null_lumiere_freq]})

stats.set_index('Col')

stats
Out[ ]:
Col NA Not NA
0 Track type 13 293803
1 Track dir 61 293755
2 Track material 110912 182904
3 Track acess 260307 33509
4 Lumiere 223554 70262

Since there are too many null values regarding track material, acess type and presence of ligtht, let's explore track type and track direction¶

In [ ]:
# Starting by track type

pistes_cyclable_type_not_na = pistes_cyclable.loc[pistes_cyclable['ame_d'].notnull()]
print("Number of track types with null values : ", pistes_cyclable_type_not_na['ame_d'].isnull().sum())
Number of track types with null values :  0

Visuzalize the top 5 departments by accidents and extract the first one¶

In [ ]:
top5 = region_by_accidents_pop2023.sort_values(by=['acc_per_hab'],ascending=[False]).head(5)
print(top5[['nom','pop_2023']])
paris = top5.head(1)
paris
                     nom  pop_2023
36                 Paris   2102650
21          Hautes-Alpes    139942
82        Maine-et-Loire    828269
60       Hautes-Pyrénées    230583
33  Pyrénées-Atlantiques    697899
Out[ ]:
code nom geometry total pop_2023 acc_per_hab
36 75 Paris POLYGON ((598781.702 133337.696, 603566.179 13... 5001 2102650 2.378427
In [ ]:
ax = paris.plot(color="white", edgecolor="black", figsize=(20, 10))

# Drop cols with index_ suffix
accidents_by_region_and_name=accidents_by_region_and_name.loc[:,~accidents_by_region_and_name.columns.str.startswith('index_')]
# Intersect tracks with paris
region_intersect_track_type = gp.sjoin(pistes_cyclable_type_not_na, paris, predicate='intersects')
# Intersect accidents with paris
accidents_intersect_type = gp.sjoin(accidents_by_region_and_name, paris, predicate='within')
# Define a colormap
region_intersect_cmap = colors.LinearSegmentedColormap.from_list(region_intersect_track_type["ame_d"].unique(), list(reversed(["green","yellow","orange","red"])))

region_intersect_track_type.plot(ax=ax,cmap='turbo',column='ame_d',legend=True)
accidents_intersect_type.plot(ax=ax,color="blue",legend=True,alpha=0.5)
Out[ ]:
<Axes: >
No description has been provided for this image

There doesnt seem to have a direct relation between accidents and track type. Lets see about the track direction¶

In [ ]:
# Starting by track type

pistes_cyclable_direction_not_na = pistes_cyclable.loc[pistes_cyclable['ame_d'].notnull()]
print("Number of track types with null values : ", pistes_cyclable_direction_not_na['ame_d'].isnull().sum())
Number of track types with null values :  0
In [ ]:
# Intersect tracks with paris

ax = paris.plot(color="white", edgecolor="black",alpha=0.5, figsize=(20, 10))

region_intersect_track_direction = gp.sjoin(pistes_cyclable_direction_not_na, paris, predicate='intersects')
# Intersect accidents with paris
accidents_intersect_track_direction = gp.sjoin(accidents_by_region_and_name, paris, predicate='within')
# Define a colormap
region_intersect_cmap = colors.LinearSegmentedColormap.from_list(region_intersect_track_direction["sens_d"].unique(), list(reversed(["green","red"])))

region_intersect_track_direction.plot(ax=ax,column='sens_d',cmap=region_intersect_cmap,legend=True)
accidents_intersect_track_direction.plot(ax=ax,column="age",color='blue',legend=True,alpha=0.5)
/home/paulo/Projects/bike_risk_map/bike_env/lib/python3.10/site-packages/geopandas/plotting.py:658: UserWarning: Only specify one of 'column' or 'color'. Using 'color'.
  warnings.warn(
Out[ ]:
<Axes: >
No description has been provided for this image

Accidents by period of the year¶

In [ ]:
months = list(accidents_intersect_track_direction['mois'].unique()).sort
months_order = ['janvier', 'février', 'mars', 'avril',
          'mai', 'juin', 'juillet', 'août',
          'septembre', 'octobre', 'novembre', 'décembre']
In [ ]:
accidents_intersect_track_direction_grouped = accidents_intersect_track_direction.sort_values('mois', ascending=True).groupby('mois').size().reset_index(name ='Accidents')

accidents_intersect_track_direction_sorted_grouped = accidents_intersect_track_direction_grouped.sort_values('mois', key=lambda s: s.apply(months_order.index), ignore_index=True)
accidents_intersect_track_direction_sorted_grouped
Out[ ]:
mois Accidents
0 janvier 359
1 février 319
2 mars 409
3 avril 391
4 mai 452
5 juin 623
6 juillet 519
7 août 295
8 septembre 507
9 octobre 449
10 novembre 360
11 décembre 318
In [ ]:
import seaborn as sns
sns.set_theme()

fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(data=accidents_intersect_track_direction_sorted_grouped, x="mois",y='Accidents').set_title("Accidents per month in paris")
ax.tick_params(axis='x', labelrotation = 45)
plt.show()
No description has been provided for this image

It's cleaar that population has a hight weight on the bicycle , since during the month June and July, we have a incresed number of tourists in Paris. The month of September, is the first after vaccation in France¶

Let's make cloroplets, using arrondissement zones in paris¶

In [ ]:
import geoplot as gplt


# Read polygons of arrondissement
arrondis_gdf = gp.read_file("../data/arrondissements.geojson")
arrondis_gdf = arrondis_gdf.to_crs('epsg:27561')

# Drop cols with index_ suffix
#paris_accidents = paris_accidents.loc[:,~paris_accidents.columns.str.startswith('index_')]

# Extract only accidents in paris (by name)
paris_accidents = accidents_by_department.loc[accidents_by_department['nom'] == 'Paris']

# Merge paris accidents with population data and remove cols with index_ (from merge result)
paris_accidents_by_pop2023 = paris_accidents.merge(population_filtered,on='nom')
paris_accidents_by_pop2023 = paris_accidents_by_pop2023.loc[:,~paris_accidents_by_pop2023.columns.str.startswith('index_')]

# Spatial join between arrondissement (polygon) and  accidents in paris (points), using 'intersects predicate
paris_arrondis_accidents_by_pop2023 = gp.sjoin(arrondis_gdf, paris_accidents_by_pop2023, predicate='intersects')

# Group by arronsidement names and merge with popsulation
population_by_arrondissement = paris_arrondis_accidents_by_pop2023.groupby('l_ar').size().reset_index(name='num_accidents')
paris_accidents_by_pop2023_arrondi = paris_arrondis_accidents_by_pop2023.merge(population_by_arrondissement,on='l_ar')

# Ploting result
ax = arrondis_gdf.plot(color="white",edgecolor="black",alpha=0.5, figsize=(20, 12))
paris_accidents_by_pop2023_arrondi.plot(ax=ax,column='num_accidents',cmap=region_intersect_cmap.reversed(),legend=True)

# Plot arrondissement labels based on polygons centroids
paris_accidents_by_pop2023_arrondi.apply(lambda x: ax.annotate(text= x['l_ar'].split(' ')[0], xy=x.geometry.centroid.coords[0], ha='center'), axis=1);
ax.set_title('Number of accidents by arrondissement', fontsize=13);
No description has been provided for this image

And plot data¶

In [ ]:
# Order by accident
population_by_arrondissement_ordered = population_by_arrondissement.sort_values('num_accidents')
# Split X axis labels to extract first word
labels = list(population_by_arrondissement_ordered['l_ar'].apply(lambda x: x.split(' ')[0]))

# Define an axis with plot and its parameters
ax = population_by_arrondissement.sort_values('num_accidents').plot(kind='bar',x='l_ar',
                                                               xlabel="Arrondisement",
                                                               ylabel="Number of accidents")
ax.set_xticklabels(labels,rotation=45)

# Plot
plt.show()
No description has been provided for this image

They are not so representative, since we cant see where are the zones where accidents occur more ofter. Lets make a grid. A greate function can be found here : https://pygis.io/docs/e_summarize_vector.html¶

In [ ]:
import math
import numpy as np
from shapely.geometry import Polygon, box

def create_grid(feature, shape, side_length):
    '''Create a grid consisting of either rectangles or hexagons with a specified side length that covers the extent of input feature.'''

    # Slightly displace the minimum and maximum values of the feature extent by creating a buffer
    # This decreases likelihood that a feature will fall directly on a cell boundary (in between two cells)
    # Buffer is projection dependent (due to units)
    feature = feature.buffer(20)

    # Get extent of buffered input feature
    min_x, min_y, max_x, max_y = feature.total_bounds

    # Shape area
    area = 0


    # Create empty list to hold individual cells that will make up the grid
    cells_list = []

    # Create grid of squares if specified
    if shape in ["square", "rectangle", "box"]:

        # Adapted from https://james-brennan.github.io/posts/fast_gridding_geopandas/
        # Create and iterate through list of x values that will define column positions with specified side length
        for x in np.arange(min_x - side_length, max_x + side_length, side_length):

            # Create and iterate through list of y values that will define row positions with specified side length
            for y in np.arange(min_y - side_length, max_y + side_length, side_length):

                # Create a box with specified side length and append to list
                cells_list.append(box(x, y, x + side_length, y + side_length))
        est = (max_x - min_x) / length(cells_list)
        north = (max_y - min_y) / length(cells_list)
        area = (est * north)

    # Otherwise, create grid of hexagons
    elif shape == "hexagon":

        # Set horizontal displacement that will define column positions with specified side length (based on normal hexagon)
        x_step = 1.5 * side_length

        # Set vertical displacement that will define row positions with specified side length (based on normal hexagon)
        # This is the distance between the centers of two hexagons stacked on top of each other (vertically)
        y_step = math.sqrt(3) * side_length

        # Get apothem (distance between center and midpoint of a side, based on normal hexagon)
        apothem = (math.sqrt(3) * side_length / 2)

        # Set column number
        column_number = 0

        # Create and iterate through list of x values that will define column positions with vertical displacement
        for x in np.arange(min_x, max_x + x_step, x_step):

            # Create and iterate through list of y values that will define column positions with horizontal displacement
            for y in np.arange(min_y, max_y + y_step, y_step):

                # Create hexagon with specified side length
                hexagon = [[x + math.cos(math.radians(angle)) * side_length, y + math.sin(math.radians(angle)) * side_length] for angle in range(0, 360, 60)]

                # Append hexagon to list
                cells_list.append(Polygon(hexagon))

            # Check if column number is even
            if column_number % 2 == 0:

                # If even, expand minimum and maximum y values by apothem value to vertically displace next row
                # Expand values so as to not miss any features near the feature extent
                min_y -= apothem
                max_y += apothem

            # Else, odd
            else:

                # Revert minimum and maximum y values back to original
                min_y += apothem
                max_y -= apothem

            # Increase column number by 1
            column_number += 1
        area  = (3 * math.sqrt(3) * pow(side_length,2)) / 2

    # Else, raise error
    else:
        raise Exception("Specify a rectangle or hexagon as the grid shape.")

    # Create grid from list of cells
    grid = gp.GeoDataFrame(cells_list, columns = ['geometry'], crs = "epsg:27561")

    # Create a column that assigns each grid a number
    grid["Grid_ID"] = np.arange(len(grid))

    # Return grid
    return grid,area

Create hexagons¶

In [ ]:
# Create heaxagon
area_grid,area = create_grid(feature = paris_arrondis_accidents_by_pop2023, shape = 'hexagon', side_length = 200)
#cell_grid["cell_id"] = cell_grid.index + 1
#cell_grid.head(5)
area_grid.plot()
Out[ ]:
<Axes: >
No description has been provided for this image

Remove cells outside Paris¶

In [ ]:
area_grid_paris = gp.sjoin(area_grid, arrondis_gdf, how='inner', predicate='intersects')
# Remove fields from spatial join
area_grid_paris = area_grid_paris.loc[:,~area_grid_paris.columns.str.startswith('index_')]
area_grid_paris.reset_index(inplace=True)
area_grid_paris.plot()
Out[ ]:
<Axes: >
No description has been provided for this image

Later, we fuse the cells with the accidents layer¶

In [ ]:
## Remove index columns from previous merge operations, if necessary

paris_accidents = paris_accidents.loc[:,~paris_accidents.columns.str.startswith('index_')]
paris_accidents_by_pop2023_arrondi = paris_accidents_by_pop2023_arrondi.loc[:,~paris_accidents_by_pop2023_arrondi.columns.str.startswith('index_')]
print("Dataframes have columns with name index ? :", any(paris_accidents.columns.str.startswith('index_')) 
                                                      and any(paris_accidents_by_pop2023_arrondi.columns.str.startswith('index_')))
Dataframes have columns with name index ? : False
In [ ]:
# Grab all dataset of accidents, within paris
accidents_paris = gp.sjoin(accidents_by_region_and_name, paris_accidents_by_pop2023_arrondi, how='inner', predicate='within')
accidents_paris = accidents_paris.loc[:,~accidents_paris.columns.str.startswith('index_')]
accidents_paris.columns = accidents_paris.columns.str.rstrip("_left")
accidents_paris.columns = accidents_paris.columns.str.rstrip("_right")

accidents_paris.plot()
Out[ ]:
<Axes: >
No description has been provided for this image
In [ ]:
accidents_paris.head()
Out[ ]:
num_acc da an mois jou hrmn dep com la lon ... choc manv vehiculeid typevehicules manoeuvehicules numvehicules code nom pop_2023 num_accidents
25055 201800041647 2018-01-13 2018 janvier samedi 21:10 94 94069 48.817260 2.458920 ... 0.0 1.0 202100048294A01 NaN NaN NaN 75 Paris 2102650 375
25118 201800054676 2018-03-06 2018 mars mardi 95:0 75 75120 48.846600 2.415980 ... 0.0 1.0 202100048294A01 NaN NaN NaN 75 Paris 2102650 375
25122 201900035012 2019-06-23 2019 juin dimanche 08:10 75 75112 48.844343 2.441665 ... 0.0 1.0 202100048294A01 NaN NaN NaN 75 Paris 2102650 375
25143 202000019957 2020-12-19 2020 décembre samedi 17:15 94 94033 48.844350 2.450700 ... 0.0 1.0 202100048294A01 NaN NaN NaN 75 Paris 2102650 375
25148 202100021295 2021-09-02 2021 septembre jeudi 08:45 75 75112 48.818409 2.451584 ... 0.0 1.0 202100048294A01 NaN NaN NaN 75 Paris 2102650 375

5 rows × 95 columns

Aggreate accidents by cell grid¶

In [ ]:
#######
# Perform spatial join, merging attribute table of wells point and that of the cell with which it intersects
# op = "intersects" also counts those that fall on a cell boundary (between two cells)
# op = "within" will not count those fall on a cell boundary

points_within = points_within.loc[:,~points_within.columns.str.startswith('index_')]

# Merging accidents data with grid data by spatial intersection boundary
grid_accidents = gp.sjoin(points_within, area_grid_paris, how='left', predicate='within')

# Add a field with constant value of 1
grid_accidents['n_acc'] = 1

# Compute stats per grid cell -- aggregate fires to grid cells with dissolve
dissolve = grid_accidents.dissolve(by="index_right", aggfunc="count")

# put this into cell
area_grid_paris.loc[dissolve.index, 'n_acc'] = dissolve.n_acc.values

# Fill the NaN values (cells without any points) with 0 if we want to see
area_grid_paris['n_acc'] = area_grid_paris['n_acc'].fillna(0)
#cell_grid = cell_grid.within(paris_accidents_by_pop2023_arrondi)]
In [ ]:
# Plot data
ax = paris_accidents_by_pop2023_arrondi.plot(markersize=.1, figsize=(15, 10),color="None",edgecolor="red",legend=True)
legend_intervals = [int(area_grid_paris["n_acc"].min()),5,10,15,int(area_grid_paris["n_acc"].max())]
accidents_paris.plot(ax = ax,marker = 'o', color = 'dimgray', markersize = 3)
area_grid_paris.plot(ax = ax, column = "n_acc", 
                cmap=cmap,edgecolor="lightseagreen", linewidth = 0.5, alpha = 0.8,legend = True,
                legend_kwds={
                    "shrink":.68,
                    "format": "%g",
                    'label': "Accidents",
                    "pad": 0.01,
                    #"ticks" : legend_intervals
                })
# Set title
ax.set_title(f'Grid of accidents per ±{area:.0f} m2', fontdict = {'fontsize': '15', 'fontweight' : '3'})
plt.show()
No description has been provided for this image

Lets test some interpolation methods to fill the empty cells and get better results¶

Give a small subset of points to train on¶

In [ ]:
def f(x):
    """Function to be approximated by polynomial interpolation."""
    return x * np.sin(x)
In [ ]:
samples = area_grid_paris["n_acc"].to_list()

# whole range we want to plot
x_plot = np.linspace(min(samples), max(samples), len(samples))

# To make it interesting, we only give a small subset of points to train on.
x_train =  samples
rng = np.random.RandomState(0)
x_train = np.sort(rng.choice(x_train, size=10, replace=False))
y_train = f(x_train)

# create 2D-array versions of these arrays to feed to transformers
X_train = x_train[:, np.newaxis]
X_plot = x_plot[:, np.newaxis]
In [ ]:
from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures, SplineTransformer

# plot function
lw = 2
fig, ax = plt.subplots()
ax.set_prop_cycle(
    color=["black", "teal", "yellowgreen", "gold", "darkorange", "tomato"]
)
ax.plot(x_plot, f(x_plot), linewidth=lw, label="ground truth")

# plot training points
ax.scatter(x_train, y_train, label="training points")

# polynomial features
for degree in [3, 4, 5]:
    model = make_pipeline(PolynomialFeatures(degree), Ridge(alpha=1e-3))
    model.fit(X_train, y_train)
    y_plot = model.predict(X_plot)
    ax.plot(x_plot, y_plot, label=f"degree {degree}")

# B-spline with 4 + 3 - 1 = 6 basis functions
model = make_pipeline(SplineTransformer(n_knots=4, degree=3), Ridge(alpha=1e-3))
model.fit(X_train, y_train)

y_plot = model.predict(X_plot)
ax.plot(x_plot, y_plot, label="B-spline")
ax.legend(loc="lower center")
ax.set_ylim(-40, 40)
plt.show()
No description has been provided for this image

Splines make a good job on fitting well the data, and provide the same time, some paramenters to control extrapolation. However, seasonal effects may cut an expected periodic continuation of the underlying signal. Periodic splines could be used in such case¶

In [ ]:
def g(x):
    """Function to be approximated by periodic spline interpolation."""
    return np.sin(x) - 0.7 * np.cos(x * 3)
In [ ]:
y_train = g(x_train)

# Extend the test data into the future:
x_plot_ext = np.linspace(min(samples), max(samples) + 10, len(samples) + 100)
X_plot_ext = x_plot_ext[:, np.newaxis]

lw = 2
fig, ax = plt.subplots()
ax.set_prop_cycle(color=["black", "tomato", "teal"])
ax.plot(x_plot_ext, g(x_plot_ext), linewidth=lw, label="ground truth")
ax.scatter(x_train, y_train, label="training points")

for transformer, label in [
    (SplineTransformer(degree=3, n_knots=10), "spline"),
    (
        SplineTransformer(
            degree=3,
            knots=np.linspace(0, 2 * np.pi, 10)[:, None],
            extrapolation="periodic",
        ),
        "periodic spline",
    ),
]:
    model = make_pipeline(transformer, Ridge(alpha=1e-3))
    model.fit(X_train, y_train)
    y_plot_ext = model.predict(X_plot_ext)
    ax.plot(x_plot_ext, y_plot_ext, label=label)

ax.legend()
fig.show()
/tmp/ipykernel_13264/4214459259.py:30: UserWarning: Matplotlib is currently using module://matplotlib_inline.backend_inline, which is a non-GUI backend, so cannot show the figure.
  fig.show()
No description has been provided for this image
In [ ]:
sns.displot(samples,kind="kde")
plt.title("Number of accidents per cell grid in Paris")
/home/paulo/Projects/bike_risk_map/bike_env/lib/python3.10/site-packages/seaborn/axisgrid.py:118: UserWarning: The figure layout has changed to tight
  self._figure.tight_layout(*args, **kwargs)
Out[ ]:
Text(0.5, 1.0, 'Number of accidents per cell grid in Paris')
No description has been provided for this image
In [ ]: